library(foreign)
library(data.table)
library(tidyr)
library(plyr)
library(dplyr)
library(xtable)
library(ggplot2)
library(scales)
library(grid)
library(gridExtra)
library(tikzDevice)
library(stringr)

# clear memory.
rm(list=ls())
gc()

# get merged data (Form A and Form 2)
d <- read.csv(file.path(fldr_cleaned_data, "formA_form2_policy_year_level.csv"), as.is = TRUE)

# sample: individual policies only
d.sample1 <- filter(d, in_form2==1, type=="individual")

#===================================================================================
# number of available plans and active insurers by year

export2tex = 1
prefix1    = "individual_plans_"

# compute the number of plans/insurers for years in first_year_issue
year_vec <- sort(unique(d.sample1$unique_first_year_issue))

# bring down to plan level
temp <- distinct(d.sample1[, c("naic_code", "planid", "policy_form", "exit_by_today", "unique_first_year_issue", "max_report_year", "exit_year")], 
                 naic_code, policy_form, .keep_all = TRUE)

# create a data frame where for each year, the number of available plans and insurers would be saved
output <- data.frame(year_vec, "active_plans"=NA, "active_insurers"=NA)

for (yy in year_vec) {
  
  # plans that are available in year==yy
  available_plans <- temp[(temp$exit_by_today==1 & temp$unique_first_year_issue<=yy & temp$exit_year>=yy) | (temp$exit_by_today==0 & temp$unique_first_year_issue<=yy), c("planid", "naic_code")]
  
  # in dataframe output, store the number of available plans and active insurers
  output[output$year_vec==yy, "active_plans"]    = nrow(available_plans)  # the same as length(unique(available_plans$planid))
  output[output$year_vec==yy, "active_insurers"] = nrow(distinct(available_plans, naic_code))
}

rm(year_vec, temp, available_plans)

# figure properties
f_width   = 3
f_height  = 3
text_size = 8

# Figure 1
if (export2tex==1){
  tikz(file=paste0(fldr_table, prefix1, "hist_available_plans_insurers_year.tex"), f_width, f_height)
}

ggplot(data = output, aes(x=year_vec, y=active_plans)) + 
  geom_bar(stat="identity", color="black", fill="gray") +
  xlab("Year") +
  ylab("Plans on the market for sales") +
  theme(text = element_text(size=text_size)) +
  scale_x_continuous(breaks=c(min(output$year_vec), 1985, 1995, 2003, 2010, 2016))

ggplot(data = output, aes(x=year_vec, y=active_insurers)) + 
  geom_bar(stat="identity", color="black", fill="gray") +
  xlab("Year") +
  ylab("Insurers selling plans") +
  theme(text = element_text(size=text_size)) +
  scale_x_continuous(breaks=c(min(output$year_vec), 1985, 1995, 2003, 2010, 2016))

#===================================================================================
# number of exiting plans by year

# bring data containing exiting plans to the plan level
temp1 <- distinct(filter(d.sample1, exit_by_today==1), naic_code, policy_form, .keep_all = TRUE)

# for each year, compute the number of exiting plans
temp2        <- ddply(temp1, "exit_year", summarise, Plans = length(naic_code))
names(temp2) <- c("year", "exiting_plans")

# for each year, get the number of available plans from output dataframe
temp3 <- output
names(temp3)[1] <- "year"

# for each year, store the number of exiting plans & available plans
temp4 <- full_join(temp3, temp2, by="year")
temp4[is.na(temp4$exiting_plans), "exiting_plans"]<-0
temp4$exit_rate <- temp4$exiting_plans/temp4$active_plans

# figure properties
f_width   = 3
f_height  = 3
text_size = 8

# Figure 2
if (export2tex==1){
  tikz(file=paste0(fldr_table, prefix1, "hist_exit_year.tex"), f_width, f_height)
}

ggplot(data = filter(temp2, year<2016), aes(x=year, y=exiting_plans)) + 
  geom_point(size=2) +
  geom_line() +
  xlab("Year") +
  ylab("Number of plans that exit") +
  theme(text = element_text(size=text_size)) +
  scale_x_continuous(breaks=c(min(temp2$year), 1990, 2003, 2010, 2016))

ggplot(data = filter(temp4, year<2016), aes(x=year, y=exit_rate)) + 
  geom_point(size=2) +
  geom_line() +
  xlab("Year") +
  ylab("Plan exit rate = exiting plans/available plans") +
  theme(text = element_text(size=text_size)) +
  scale_x_continuous(breaks=c(min(temp4$year), min(temp2$year), 1990,  2003,  2015))


